library(tidyverse)
library(knitr)
library(DT)
Challenge 9: Baby Names
Setup
# Load StateNames_A.csv from the project's supporting_artifacts folder.
names_a <- here::here("supporting_artifacts", "StateNames_A.csv") |>
  read_csv()
Dataset Preview
# Interactive preview of the raw data. Only the first 1,000 rows are handed
# to the widget: passing the full table made DT warn that the data is too big
# for client-side DataTables.
names_a |>
  head(n = 1000) |>
  datatable(class = "cell-border stripe")
3 Summarizing and Visualizing
1.
# Total count of babies named "Allison" per state, with one column per value
# of Gender (F and M) and one row per state.
names_allison <- names_a |>
  filter(Name == "Allison") |>
  group_by(State, Gender) |>
  # Drop the grouping explicitly so the result is a plain tibble and
  # summarize() does not emit its "grouped output" message.
  summarize(total = sum(Count), .groups = "drop") |>
  pivot_wider(
    names_from = Gender,
    values_from = total,
    values_fill = 0  # a state/gender pair with no records counts as 0
  ) |>
  rename(
    "Number of Female Babies" = `F`,
    "Number of Male Babies" = M
  )
# Render the per-state Allison counts as a pipe-format table.
names_allison |>
  kable(format = "pipe")
| State | Number of Female Babies | Number of Male Babies |
|---|---|---|
| AK | 232 | 0 |
| AL | 1535 | 0 |
| AR | 1198 | 0 |
| AZ | 1880 | 0 |
| CA | 12413 | 0 |
| CO | 1594 | 0 |
| CT | 1099 | 0 |
| DC | 321 | 0 |
| DE | 294 | 0 |
| FL | 4455 | 0 |
| GA | 3257 | 0 |
| HI | 183 | 0 |
| IA | 1477 | 0 |
| ID | 451 | 0 |
| IL | 5110 | 0 |
| IN | 3067 | 0 |
| KS | 1283 | 0 |
| KY | 1905 | 20 |
| LA | 1209 | 0 |
| MA | 2218 | 0 |
| MD | 2229 | 0 |
| ME | 340 | 0 |
| MI | 4014 | 0 |
| MN | 2374 | 0 |
| MO | 2882 | 0 |
| MS | 817 | 0 |
| MT | 226 | 0 |
| NC | 3435 | 0 |
| ND | 285 | 0 |
| NE | 807 | 0 |
| NH | 412 | 0 |
| NJ | 3052 | 0 |
| NM | 399 | 0 |
| NV | 729 | 0 |
| NY | 5747 | 0 |
| OH | 5487 | 0 |
| OK | 1421 | 0 |
| OR | 1186 | 0 |
| PA | 4307 | 0 |
| RI | 306 | 0 |
| SC | 1228 | 0 |
| SD | 376 | 0 |
| TN | 2488 | 0 |
| TX | 10192 | 0 |
| UT | 1125 | 0 |
| VA | 3220 | 0 |
| VT | 135 | 0 |
| WA | 1956 | 0 |
| WI | 2367 | 0 |
| WV | 813 | 0 |
| WY | 142 | 0 |
# Interactive version of the per-state Allison table.
names_allison |>
  datatable(class = "cell-border stripe")
2.
# Keep only the state and the female-baby count.
names_allison <- select(names_allison, State, `Number of Female Babies`)
3.
# Yearly total of babies named "Allison", summed over every row for that year.
# The total is stored in a column called `sum`.
allison_years <- names_a |>
  filter(Name == "Allison") |>
  group_by(Year) |>
  summarize(sum = sum(Count), .groups = "drop")
# Line plot of the yearly "Allison" totals. The y-axis label is intentionally
# removed.
allison_years |>
  ggplot(mapping = aes(x = Year, y = sum)) +
  geom_line() +
  labs(
    y = NULL,  # NULL removes the label; element_blank() is a theme element
    title = 'Frequency of Babies Named "Allison"'
  )
4 Modeling the Number of Allisons
4.
# Simple linear regression of the yearly Allison total (`sum`) on Year.
allison_model <- lm(sum ~ Year, data = allison_years)
5.
# Scatterplot of the yearly totals with a fitted least-squares line. The plot
# is built from the data frame rather than the lm object, so it does not rely
# on ggplot2 implicitly converting a model with fortify(). The smoothing
# formula is given explicitly, which also silences the geom_smooth() message.
allison_years |>
  ggplot(mapping = aes(x = Year, y = sum)) +
  geom_point() +
  stat_smooth(method = "lm", formula = y ~ x) +
  labs(y = "Number of Babies Named Allison")
6.
$\widehat{\text{count}}_{\text{Allison}} = 209815.1 - 101.6\,(\text{Year})$
7.
# Residuals-versus-fitted plot for the linear model, with a dashed reference
# line at zero.
ggplot(
  data = broom::augment(allison_model),
  mapping = aes(x = .fitted, y = .resid)
) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residual Plot", x = "Predicted", y = "Residuals")
There are no clear trends in the residual plot but there is a group of predicted counts from around 5900 to 6600 that were all underestimated.
8.
The fitted slope is negative (about 102 fewer babies named Allison each year), so your name is declining in coolness/popularity.
5 Spelling by State
1.
# Keep only the rows for the three spellings: Allan, Alan and Allen.
names_allan <- names_a |>
  filter(Name %in% c("Allan", "Alan", "Allen"))
2.
# Count each spelling in California and Pennsylvania for the year 2000, giving
# one row per state and one column per spelling.
names_allan <- names_allan |>
  filter(
    State %in% c("CA", "PA"),
    Year == 2000  # compare as a number rather than relying on string coercion
  ) |>
  group_by(State, Name) |>
  # "drop_last" is what summarize() does by default here (the result stays
  # grouped by State); stating it explicitly silences the regrouping message.
  summarize(sum = sum(Count), .groups = "drop_last") |>
  pivot_wider(names_from = Name, values_from = sum) |>
  rename(
    'Frequency of "Alan"' = Alan,
    'Frequency of "Allan"' = Allan,
    'Frequency of "Allen"' = Allen
  )
# Render the spelling counts as a pipe-format table.
names_allan |>
  kable(format = "pipe")
| State | Frequency of “Alan” | Frequency of “Allan” | Frequency of “Allen” |
|---|---|---|---|
| CA | 584 | 131 | 176 |
| PA | 51 | 12 | 56 |
3.
# Convert each state's spelling counts into percentages of that state's total
# across the three spellings.
names_allan_pct <- names_allan |>
  mutate(
    # Add the columns row by row. sum() over the three columns is a per-row
    # total only while the data is grouped with one row per group; on an
    # ungrouped table it would return the grand total.
    total = `Frequency of "Alan"` +
      `Frequency of "Allan"` +
      `Frequency of "Allen"`,
    `Percent "Alan"` = (`Frequency of "Alan"` / total) * 100,
    `Percent "Allan"` = (`Frequency of "Allan"` / total) * 100,
    `Percent "Allen"` = (`Frequency of "Allen"` / total) * 100
  ) |>
  # Drop the helper total and the raw counts, leaving only the percentages.
  select(
    -total,
    -`Frequency of "Alan"`,
    -`Frequency of "Allan"`,
    -`Frequency of "Allen"`
  )
# Render the spelling percentages as a pipe-format table.
names_allan_pct |>
  kable(format = "pipe")
| State | Percent “Alan” | Percent “Allan” | Percent “Allen” |
|---|---|---|---|
| CA | 65.54433 | 14.70258 | 19.75309 |
| PA | 42.85714 | 10.08403 | 47.05882 |
Among babies born in 2000 with one of these three spellings: in CA, 65.54% were named Alan, 14.70% were named Allan, and 19.75% were named Allen. In PA, 42.86% were named Alan, 10.08% were named Allan, and 47.06% were named Allen.